library(devtools);
## Loading required package: usethis
library(humanVerseWSU);
# Base URL for raw files on the master branch of the humanVerseWSU repository.
path.github <- "https://raw.githubusercontent.com/MonteShaffer/humanVerseWSU/master/"

# Fetch and evaluate the functions-nlp.R script from that repository.
# NOTE(review): this runs whatever is currently on master; consider checking
# the download against a known hash -- confirm with the maintainer.
include.me <- paste0(path.github, "misc/functions-nlp.R")
source_url(include.me)
## SHA-1 hash of file is d411302cf0069ed51cc5dfb8c1163a7869329eed
## Loading required package: NLP
## Warning: package 'SentimentAnalysis' was built under R version 4.0.3
##
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
##
## write
# Source the remaining helper scripts from GitHub, one at a time.
# `include.me` is reused as the holder for the URL currently being sourced.

# -- string helpers
include.me <- paste0(path.github, "misc/functions-nlp-str.R")
source_url(include.me)
## SHA-1 hash of file is 54604c6aba08045f7965d86773bf15d7288a67fe

# -- stack helpers
include.me <- paste0(path.github, "misc/functions-nlp-stack.R")
source_url(include.me)
## SHA-1 hash of file is fb9b84a774f8d46c97db92eabfc7ae778b49cfeb

# -- part-of-speech helpers (the recorded output shows openNLP being loaded)
include.me <- paste0(path.github, "misc/functions-nlp-pos.R")
source_url(include.me)
## SHA-1 hash of file is c4185150c8c43a59e88093872bef9c0fb4bc1dc5
## Warning: package 'openNLP' was built under R version 4.0.3

# -- encryption helpers (these live under the package's R/ folder)
include.me <- paste0(path.github, "humanVerseWSU/R/functions-encryption.R")
source_url(include.me)
## SHA-1 hash of file is bb54547500147dcb77047053bbf5c4bd153ffc38
# Local root folder for this unit's working files.
# NOTE(review): absolute path tied to one user's Windows/Dropbox layout;
# it has to be adjusted before the notebook can run on another machine.
path.to.nascent <- "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/"

# The NLP material sits in a sub-folder of that root (trailing slash kept so
# later paste0() calls can append file names directly).
folder.nlp <- "nlp/"
path.to.nlp <- paste0(path.to.nascent, folder.nlp)
########## load data ##########
# Project Gutenberg e-book number; the same number is embedded in the two
# remote URLs further down.
gutenberg.id <- 2591

# Folder for Gutenberg data in general, and a per-book sub-folder named after
# the id. createDirRecursive() presumably creates them when missing -- verify
# against its definition.
path.to.gutenberg <- paste0(path.to.nlp, "_data_/gutenberg/")
createDirRecursive(path.to.gutenberg)

path.to.grimm <- paste0(path.to.gutenberg, gutenberg.id, "/")
createDirRecursive(path.to.grimm)

# currently required by grabHTML ... TODO: fix
local.data.path <- path.to.gutenberg

# Remote plain-text and HTML editions of the book.
txt.file.remote <- "https://www.gutenberg.org/files/2591/2591-0.txt"
html.file.remote <- "https://www.gutenberg.org/files/2591/2591-h/2591-h.htm"

# Parse the book into a data frame; the recorded output below shows the two
# source files being served from the local cache under path.to.grimm.
df.grimm <- parseGutenberg.GRIMM(
  path.to.grimm,
  file.stem = "fairytales",
  txt.file.remote = txt.file.remote,
  html.file.remote = html.file.remote,
  my.local.path = path.to.gutenberg
)
## [1] "grabHTML() ... from cache ... C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/fairytales.txt"
## [1] "grabHTML() ... from cache ... C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/fairytales.html"
## [1] "Time elapsed: 0.01"
# df.grimm;
########## load stopwords ##########
# Location of the snowball stop-word template inside the NLP folder.
stop.file.snowball = paste0(path.to.nlp, "stop-templates/snowball.txt");
# Read the file and break it into one entry per line, trimming each entry.
# The separator is the regex "\r?\n" (strsplit treats it as a regular
# expression by default) so both CRLF and LF-only copies of the file are
# split correctly; with a literal "\r\n" an LF-only file would come back as a
# single giant "stop word". CRLF input yields exactly the same result as before.
stop.snowball = trimMe(strsplit( grabHTML(stop.file.snowball), "\r?\n")[[1]]);
## [1] "grabHTML() ... from cache ... C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/stop-templates/snowball.txt"
To demonstrate how we can utilize the features, we will examine three basic sets of features (we generated a lot more than this in the last notebook):
We are choosing to analyze each story as its own unit of analysis. We could analyze each paragraph or sentence if we wanted to get into the minutiae. Since the Brothers GRIMM compiled these stories, maybe we will find correlations among the stories (linguistic correlations) suggesting that a single author wrote one or more of the stories that the Brothers GRIMM later placed in their compilation.
## syllables issue
# one = prepareOneStory(df.grimm, path.to.grimm,
# title = "THE BROTHERS GRIMM FAIRY TALES",
# title.f = "THE.BROTHERS.GRIMM.FAIRY.TALES",
# my.stopwords = stop.snowball
# );
# Build the per-story summary table for every story ("ALL").
# With the per-story results already cached on disk this returns quickly
# (see the recorded timings below).
my.df <- summarizeGeneral(
  which = "ALL",
  df.grimm,
  path.to.grimm,
  my.stopwords = stop.snowball
)
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.BROTHERS.GRIMM.FAIRY.TALES-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.BROTHERS.GRIMM.FAIRY.TALES ... [--TOTAL--] in 0.02 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.GOLDEN.BIRD-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.GOLDEN.BIRD ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/HANS.IN.LUCK-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... HANS.IN.LUCK ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/JORINDA.AND.JORINDEL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... JORINDA.AND.JORINDEL ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.TRAVELLING.MUSICIANS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.TRAVELLING.MUSICIANS ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/OLD.SULTAN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... OLD.SULTAN ... [--TOTAL--] in 0.03 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.STRAW.THE.COAL.AND.THE.BEAN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.STRAW.THE.COAL.AND.THE.BEAN ... [--TOTAL--] in 0.02 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/BRIAR.ROSE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... BRIAR.ROSE ... [--TOTAL--] in 0.06 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.DOG.AND.THE.SPARROW-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.DOG.AND.THE.SPARROW ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.TWELVE.DANCING.PRINCESSES-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.TWELVE.DANCING.PRINCESSES ... [--TOTAL--] in 0.06 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.FISHERMAN.AND.HIS.WIFE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.FISHERMAN.AND.HIS.WIFE ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.WILLOW.WREN.AND.THE.BEAR-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.WILLOW.WREN.AND.THE.BEAR ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.FROG.PRINCE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.FROG.PRINCE ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/CAT.AND.MOUSE.IN.PARTNERSHIP-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... CAT.AND.MOUSE.IN.PARTNERSHIP ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.GOOSE.GIRL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.GOOSE.GIRL ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.ADVENTURES.OF.CHANTICLEER.AND.PARTLET-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.ADVENTURES.OF.CHANTICLEER.AND.PARTLET ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/RAPUNZEL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... RAPUNZEL ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/FUNDEVOGEL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... FUNDEVOGEL ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.VALIANT.LITTLE.TAILOR-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.VALIANT.LITTLE.TAILOR ... [--TOTAL--] in 0.13 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/HANSEL.AND.GRETEL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... HANSEL.AND.GRETEL ... [--TOTAL--] in 0.15 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.MOUSE.THE.BIRD.AND.THE.SAUSAGE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.MOUSE.THE.BIRD.AND.THE.SAUSAGE ... [--TOTAL--] in 0.03 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/MOTHER.HOLLE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... MOTHER.HOLLE ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/LITTLE.RED.CAP.[LITTLE.RED.RIDING.HOOD]-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... LITTLE.RED.CAP.[LITTLE.RED.RIDING.HOOD] ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.ROBBER.BRIDEGROOM-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.ROBBER.BRIDEGROOM ... [--TOTAL--] in 0.06 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/TOM.THUMB-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... TOM.THUMB ... [--TOTAL--] in 0.09 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/RUMPELSTILTSKIN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... RUMPELSTILTSKIN ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/CLEVER.GRETEL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... CLEVER.GRETEL ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.OLD.MAN.AND.HIS.GRANDSON-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.OLD.MAN.AND.HIS.GRANDSON ... [--TOTAL--] in 0.01 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.LITTLE.PEASANT-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.LITTLE.PEASANT ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/FREDERICK.AND.CATHERINE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... FREDERICK.AND.CATHERINE ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/SWEETHEART.ROLAND-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... SWEETHEART.ROLAND ... [--TOTAL--] in 0.06 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/SNOWDROP-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... SNOWDROP ... [--TOTAL--] in 0.09 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.PINK-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.PINK ... [--TOTAL--] in 0.06 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/CLEVER.ELSIE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... CLEVER.ELSIE ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.MISER.IN.THE.BUSH-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.MISER.IN.THE.BUSH ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/ASHPUTTEL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... ASHPUTTEL ... [--TOTAL--] in 0.09 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.WHITE.SNAKE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.WHITE.SNAKE ... [--TOTAL--] in 0.06 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.WOLF.AND.THE.SEVEN.LITTLE.KIDS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.WOLF.AND.THE.SEVEN.LITTLE.KIDS ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.QUEEN.BEE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.QUEEN.BEE ... [--TOTAL--] in 0.03 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.ELVES.AND.THE.SHOEMAKER-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.ELVES.AND.THE.SHOEMAKER ... [--TOTAL--] in 0.03 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.JUNIPER.TREE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.JUNIPER.TREE ... [--TOTAL--] in 0.12 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.TURNIP-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.TURNIP ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/CLEVER.HANS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... CLEVER.HANS ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.THREE.LANGUAGES-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.THREE.LANGUAGES ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.FOX.AND.THE.CAT-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.FOX.AND.THE.CAT ... [--TOTAL--] in 0.02 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.FOUR.CLEVER.BROTHERS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.FOUR.CLEVER.BROTHERS ... [--TOTAL--] in 0.1 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/LILY.AND.THE.LION-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... LILY.AND.THE.LION ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.FOX.AND.THE.HORSE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.FOX.AND.THE.HORSE ... [--TOTAL--] in 0.02 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.BLUE.LIGHT-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.BLUE.LIGHT ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.RAVEN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.RAVEN ... [--TOTAL--] in 0.09 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.GOLDEN.GOOSE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.GOLDEN.GOOSE ... [--TOTAL--] in 0.06 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.WATER.OF.LIFE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.WATER.OF.LIFE ... [--TOTAL--] in 0.09 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.TWELVE.HUNTSMEN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.TWELVE.HUNTSMEN ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.KING.OF.THE.GOLDEN.MOUNTAIN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.KING.OF.THE.GOLDEN.MOUNTAIN ... [--TOTAL--] in 0.09 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/DOCTOR.KNOWALL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... DOCTOR.KNOWALL ... [--TOTAL--] in 0.03 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.SEVEN.RAVENS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.SEVEN.RAVENS ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.WEDDING.OF.MRS.FOX-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.WEDDING.OF.MRS.FOX ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.SALAD-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.SALAD ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.STORY.OF.THE.YOUTH.WHO.WENT.FORTH.TO.LEARN.WHAT.FEAR.WAS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.STORY.OF.THE.YOUTH.WHO.WENT.FORTH.TO.LEARN.WHAT.FEAR.WAS ... [--TOTAL--] in 0.15 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/KING.GRISLY.BEARD-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... KING.GRISLY.BEARD ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/IRON.HANS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... IRON.HANS ... [--TOTAL--] in 0.13 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/CAT.SKIN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... CAT.SKIN ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/SNOW.WHITE.AND.ROSE.RED-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... SNOW.WHITE.AND.ROSE.RED ... [--TOTAL--] in 0.09 secs"
# Display the summary table (top-level auto-print).
my.df
Scaling certain columns as necessary.
# Rescale groups of columns by a reference column from the same row.
# Columns are addressed by position, so this depends on the exact column
# order of my.df.

# Columns 5-7 relative to column 4.
my.df[, 5:7] <- my.df[, 5:7] / my.df[, 4]

# Row-wise sum of columns 14-22 is stored in P.total; columns 15-22 are then
# expressed as fractions of that sum.
# NOTE(review): column 14 contributes to the sum but is not itself divided --
# confirm this is intended.
my.df$P.total <- rowSums(my.df[, 14:22])
my.df[, 15:22] <- my.df[, 15:22] / my.df$P.total

# Columns 24-28 relative to column 23, and columns 30-33 relative to column 29
# (each divisor column is left untouched, so one assignment per group suffices).
my.df[, 24:28] <- my.df[, 24:28] / my.df[, 23]
my.df[, 30:33] <- my.df[, 30:33] / my.df[, 29]

# Label the rows with the story titles.
rownames(my.df) <- my.df$title
# Source the EDA helper script from the repository's package folder; the
# recorded output below shows it attaching ggplot2 and psych.
include.me <- paste0(path.github, "humanVerseWSU/R/functions-EDA.R")
source_url(include.me)
## SHA-1 hash of file is 62ba3333da32792e57c410e3f02a443a4c7f4985
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Hierarchical clustering of the stories on every column except column 2,
# with 8 as the second argument (the recorded output reports 8 pruning steps).
# The return value is kept only to stop it from being auto-printed.
do.nothing <- perform.hclust(my.df[, -2], 8, plot.grid = 1)
## Registered S3 method overwritten by 'dendextend':
## method from
## text.pvclust pvclust
## [1] "Pruning 1 of 8"
## [1] "Pruning 2 of 8"
## [1] "Pruning 3 of 8"
## [1] "Pruning 4 of 8"
## [1] "Pruning 5 of 8"
## [1] "Pruning 6 of 8"
## [1] "Pruning 7 of 8"
## [1] "Pruning 8 of 8"
# do.nothing = perform.hclust(t(my.df[,c(-2)]), 6, plot.grid=1);
SVD … orthogonal factors … PCA … eigen
# Drop the title row labels again before building the analysis matrix.
rownames(my.df) <- NULL

# X: all columns except column 2; Xs: the same data centered and scaled
# column by column via scale().
X <- my.df[, -2]
Xs <- scale(X)

# Run the factor-count diagnostics on X (the recorded output below shows
# parallel analysis, VSS and an eigenvalue >= 1 rule, then a vote tally).
Xs.how.many <- howManyFactorsToSelect(X)
## [1] " Paralell Analysis"
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## In factor.scores, the correlation matrix is singular, an approximation is used
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Parallel analysis suggests that the number of factors = 6 and the number of components = NA
## [1] "============================================="
## [1] " VSS Analysis"
## Warning in sqrt(e$values): NaNs produced
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(R): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## [1] "************************"
## [1] " Eigenvalues >= 1 ... [ n = 7 ]"
## [1] 9.173239 7.270019 4.397535 2.576206 2.210343 2.155448 1.274959
## [1] "************************"
## [1] "A 3-Factor solution has the most votes!"
## [1] "A 4-Factor solution has the most votes!"
## [1] "A 6-Factor solution has the most votes!"
## [1] "A 7-Factor solution has the most votes!"
## [1] ""
## [1] "************************"
## [1] " Final Analysis of VSS, Eigen, nFactors"
## Factor vote.count
## 1 1 1
## 2 2 1
## 3 3 2
## 4 4 2
## 5 5 1
## 6 6 2
## 7 7 2
## [1] ""
# Rank-6 truncated SVD of X: keep the first 6 left and right singular vectors.
Xs.svd <- svd(x = X, nu = 6, nv = 6)
# plot(my.df.svd$d);
# Principal components of the unscaled X (princomp defaults to the covariance
# matrix); the summary is auto-printed below.
Xs.princomp <- stats::princomp(x = X)
summary(object = Xs.princomp)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1528.0923244 78.416880934 59.19327553 3.757021e+01
## Proportion of Variance 0.9946996 0.002619461 0.00149258 6.012846e-04
## Cumulative Proportion 0.9946996 0.997319023 0.99881160 9.994129e-01
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 3.090471e+01 1.690904e+01 8.682077e+00 6.336147e+00
## Proportion of Variance 4.068573e-04 1.217954e-04 3.210998e-05 1.710185e-05
## Cumulative Proportion 9.998197e-01 9.999415e-01 9.999736e-01 9.999908e-01
## Comp.9 Comp.10 Comp.11 Comp.12
## Standard deviation 4.414859e+00 1.275923e+00 6.534212e-01 3.443032e-01
## Proportion of Variance 8.302835e-06 6.934924e-07 1.818776e-07 5.049808e-08
## Cumulative Proportion 9.999991e-01 9.999997e-01 9.999999e-01 1.000000e+00
## Comp.13 Comp.14 Comp.15 Comp.16
## Standard deviation 1.481396e-01 9.580105e-02 8.043311e-02 5.635891e-02
## Proportion of Variance 9.348352e-09 3.909609e-09 2.755894e-09 1.353063e-09
## Cumulative Proportion 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## Comp.17 Comp.18 Comp.19 Comp.20
## Standard deviation 4.737834e-02 3.549784e-02 2.475738e-02 1.957271e-02
## Proportion of Variance 9.562081e-10 5.367803e-10 2.610970e-10 1.631904e-10
## Cumulative Proportion 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## Comp.21 Comp.22 Comp.23 Comp.24
## Standard deviation 1.764256e-02 1.235367e-02 1.034062e-02 8.272353e-03
## Proportion of Variance 1.325916e-10 6.501067e-11 4.554977e-11 2.915083e-11
## Cumulative Proportion 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## Comp.25 Comp.26 Comp.27 Comp.28
## Standard deviation 2.840654e-03 2.293402e-03 1.63145e-05 4.662909e-06
## Proportion of Variance 3.437396e-12 2.240541e-12 1.13381e-16 9.262040e-18
## Cumulative Proportion 1.000000e+00 1.000000e+00 1.00000e+00 1.000000e+00
## Comp.29 Comp.30 Comp.31 Comp.32 Comp.33
## Standard deviation 2.725119e-07 1.387901e-09 1.018106e-09 0 0
## Proportion of Variance 3.163469e-20 8.205589e-25 4.415486e-25 0 0
## Cumulative Proportion 1.000000e+00 1.000000e+00 1.000000e+00 1 1
## Comp.34 Comp.35
## Standard deviation 0 0
## Proportion of Variance 0 0
## Cumulative Proportion 1 1
# Biplot of principal components 3 and 4.
# NOTE(review): the "zero-length arrow" warnings presumably come from features
# whose loadings on both components are zero -- confirm.
biplot(Xs.princomp, choices = 3:4)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
library(lsa);
## Warning: package 'lsa' was built under R version 4.0.3
## Loading required package: SnowballC
# cosine(x);
# Rank-6 truncated SVD of X: u has one row per story (row of X),
# v has one row per feature (column of X).
Xs.svd = svd(X, nu = 6, nv = 6);
# Feature-by-feature cosine similarity has to be built from v.  The columns of
# u are orthonormal singular vectors, so cosine(Xs.svd$u) is always the 6 x 6
# identity matrix and says nothing about the features.
# cosine() compares columns, hence the transpose (one column per feature).
Xs.cos.features = ( round( cosine(t(Xs.svd$v)),2 ) );
rownames(Xs.cos.features) = colnames(X);
colnames(Xs.cos.features) = colnames(X);
as.data.frame(Xs.cos.features);
# Story-by-story cosine similarity in the reduced 6-dimensional space.
Xs.cos.stories = ( round( cosine(t(Xs.svd$u)),2 ) );
rownames(Xs.cos.stories) = my.df$title;
colnames(Xs.cos.stories) = my.df$title;
as.data.frame(Xs.cos.stories);
# NOTE(review): Xs.cos.stories is a similarity matrix (1 = identical); confirm
# that perform.hclust converts it to a distance when dist.p = "cosine".
do.nothing = perform.hclust(X, 6, dist.method = Xs.cos.stories, dist.p="cosine", plot.grid=1);
## [1] "Pruning 1 of 6"
## [1] "Pruning 2 of 6"
## [1] "Pruning 3 of 6"
## [1] "Pruning 4 of 6"
## [1] "Pruning 5 of 6"
## [1] "Pruning 6 of 6"
# Build the story-by-word count table from df.grimm (the accompanying text
# calls this a term-frequency matrix), then auto-print it.
my.df <- summarizeCustom(which = "ALL", df.grimm)
my.df
This matrix is called a term-frequency matrix. The rows are the “stories,” and the columns are the “words” or features. Each cell holds the number of occurrences (the frequency) of that word in that story. We could truncate the matrix by removing columns that are mostly zeroes, which we can identify by reviewing the column sums.
When we scale this type of data, there are two elements that matter, the “term frequency” (tf) and the “document frequency” (df). We will update the matrix to account for these options. See https://en.wikipedia.org/wiki/Tf%E2%80%93idf.
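The overall scaling is called tf-idf (term frequency–inverse document frequency): $\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$, i.e. the weight of term $t$ within document $d$ multiplied by the inverse-document-frequency weight of $t$ across the corpus.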
We could do a binary result (0, 1); we are currently doing the raw count. The easiest transform is the “log normalization” and I recommend using this option.
# Raw count matrix: one row per story, one column per term.
X <- as.matrix(my.df)
# we have and/or in this list ... it will dominate ... it would be useful as an isolated analysis ... but let's keep it all in for now ...
library(matrixStats)
n.docs <- nrow(X)
n.word <- ncol(X)
# Per-row totals and per-row maxima, used as term-frequency denominators.
words.in.doc <- rowSums2(X)
maxfreq.in.doc <- rowMaxs(X)
# Term-frequency variants:
tf.raw <- X                                  # raw counts
tf.bin <- replace(X, X > 0, 1)               # presence / absence
# A length-nrow(X) vector recycles down the columns, so each row is divided by
# its own total (or its own maximum below).
tf.s <- X / words.in.doc                     # share of the row total
tf.logn <- log(1 + X)                        # log normalization
tf.logn2 <- 0.5 + 0.5 * X / maxfreq.in.doc   # 0.5 + 0.5 * count / row maximum
Inverse document frequency asks: how common is the word across all documents?
X = as.matrix(my.df);
# Document frequency: the number of rows (stories) in which each term occurs
# at least once.  colCounts() counts cells equal to `value` (default TRUE),
# so on the numeric matrix it would only count cells that equal exactly 1;
# counting on the logical matrix X > 0 gives the intended frequency.
term.in.corpus = colCounts(X > 0);
# +1 keeps the denominator non-zero for a term that occurs in no document.
n.t = 1 + term.in.corpus;
idf.un = 1;
idf.s = log(n.docs / n.t);
# n.t already carries the +1 smoothing, so it is not added a second time.
idf.smooth = log(n.docs / n.t) + 1;
# NOTE(review): "idx.prob" looks like a typo for "idf.prob"; the name is kept
# in case other code refers to it.  The value is -Inf/NaN when n.t >= n.docs.
idx.prob = log( (n.docs - n.t) / n.t);
# tf-idf is the PRODUCT of the two weights.  idf.s has one entry per column
# (term), so it is applied along margin 2 with sweep(); a bare `matrix op vector`
# would recycle the vector down the rows and mis-align the weights.
X.tf.logn.idf.s = sweep(tf.logn, 2, idf.s, "*");
X.tf.s.idf.un = tf.s * idf.un;
Clearly the scaling is changing the results.
FAIRY TALES (the intro) should be an isolate.
Keeping “and” is going to influence the results.
# Hierarchical clustering on the raw count matrix; the console output reports
# "Pruning k of 8", so 8 is presumably the number of cuts -- confirm.
do.nothing <- perform.hclust(X, 8, plot.grid = 1)
## [1] "Pruning 1 of 8"
## [1] "Pruning 2 of 8"
## [1] "Pruning 3 of 8"
## [1] "Pruning 4 of 8"
## [1] "Pruning 5 of 8"
## [1] "Pruning 6 of 8"
## [1] "Pruning 7 of 8"
## [1] "Pruning 8 of 8"
# Rank-8 truncated SVD of the raw counts; each row of u describes one story.
Xs.svd <- svd(X, nu = 8, nv = 8)
# Story-by-story cosine similarity (cosine() compares columns, hence t(u)),
# rounded to two decimals.
Xs.cos.stories <- round(cosine(t(Xs.svd$u)), 2)
# NOTE(review): my.df$title is NULL unless my.df has a "title" column, in
# which case the matrix stays unlabeled -- confirm.
dimnames(Xs.cos.stories) <- list(my.df$title, my.df$title)
as.data.frame(Xs.cos.stories)
do.nothing <- perform.hclust(
  X, 8,
  dist.method = Xs.cos.stories,
  dist.p = "cosine",
  plot.grid = 1
)
## [1] "Pruning 1 of 8"
## [1] "Pruning 2 of 8"
## [1] "Pruning 3 of 8"
## [1] "Pruning 4 of 8"
## [1] "Pruning 5 of 8"
## [1] "Pruning 6 of 8"
## [1] "Pruning 7 of 8"
## [1] "Pruning 8 of 8"
# Same clustering, now on the log-normalized tf / idf.s weighted matrix.
do.nothing <- perform.hclust(X.tf.logn.idf.s, 8, plot.grid = 1)
## [1] "Pruning 1 of 8"
## [1] "Pruning 2 of 8"
## [1] "Pruning 3 of 8"
## [1] "Pruning 4 of 8"
## [1] "Pruning 5 of 8"
## [1] "Pruning 6 of 8"
## [1] "Pruning 7 of 8"
## [1] "Pruning 8 of 8"
# Rank-8 truncated SVD of the log-normalized, idf-weighted matrix.
Xs.svd <- svd(X.tf.logn.idf.s, nu = 8, nv = 8)
# Story-by-story cosine similarity in the reduced space, rounded to 2 decimals.
Xs.cos.stories <- round(cosine(t(Xs.svd$u)), 2)
dimnames(Xs.cos.stories) <- list(my.df$title, my.df$title)
as.data.frame(Xs.cos.stories)
do.nothing <- perform.hclust(
  X.tf.logn.idf.s, 8,
  dist.method = Xs.cos.stories,
  dist.p = "cosine",
  plot.grid = 1
)
## [1] "Pruning 1 of 8"
## [1] "Pruning 2 of 8"
## [1] "Pruning 3 of 8"
## [1] "Pruning 4 of 8"
## [1] "Pruning 5 of 8"
## [1] "Pruning 6 of 8"
## [1] "Pruning 7 of 8"
## [1] "Pruning 8 of 8"
# Same clustering, now on the row-share tf matrix (idf weight fixed at 1).
do.nothing <- perform.hclust(X.tf.s.idf.un, 8, plot.grid = 1)
## [1] "Pruning 1 of 8"
## [1] "Pruning 2 of 8"
## [1] "Pruning 3 of 8"
## [1] "Pruning 4 of 8"
## [1] "Pruning 5 of 8"
## [1] "Pruning 6 of 8"
## [1] "Pruning 7 of 8"
## [1] "Pruning 8 of 8"
# Rank-8 truncated SVD of the row-share tf matrix.
Xs.svd <- svd(X.tf.s.idf.un, nu = 8, nv = 8)
# Story-by-story cosine similarity in the reduced space, rounded to 2 decimals.
Xs.cos.stories <- round(cosine(t(Xs.svd$u)), 2)
dimnames(Xs.cos.stories) <- list(my.df$title, my.df$title)
as.data.frame(Xs.cos.stories)
do.nothing <- perform.hclust(
  X.tf.s.idf.un, 8,
  dist.method = Xs.cos.stories,
  dist.p = "cosine",
  plot.grid = 1
)
## [1] "Pruning 1 of 8"
## [1] "Pruning 2 of 8"
## [1] "Pruning 3 of 8"
## [1] "Pruning 4 of 8"
## [1] "Pruning 5 of 8"
## [1] "Pruning 6 of 8"
## [1] "Pruning 7 of 8"
## [1] "Pruning 8 of 8"
We will now examine some linguistic features: strings of word “.tags”. [I need to update the caching mechanism to cache based on the “n-grams” analyzed; I used 5 for this setup.]
# Rebuild my.df from ".tags" features using 5-grams.  The console output below
# shows one cached .rds path per story under path.to.grimm.
my.df <- summarizeMatrix.POS(
  which = "ALL",
  df.grimm,
  path.to.grimm,
  my.stopwords = stop.snowball,
  nfeature = ".tags",
  ngrams = 5
)
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.BROTHERS.GRIMM.FAIRY.TALES-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.BROTHERS.GRIMM.FAIRY.TALES ... [--TOTAL--] in 0.01 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.GOLDEN.BIRD-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.GOLDEN.BIRD ... [--TOTAL--] in 0.1 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/HANS.IN.LUCK-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... HANS.IN.LUCK ... [--TOTAL--] in 0.11 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/JORINDA.AND.JORINDEL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... JORINDA.AND.JORINDEL ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.TRAVELLING.MUSICIANS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.TRAVELLING.MUSICIANS ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/OLD.SULTAN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... OLD.SULTAN ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.STRAW.THE.COAL.AND.THE.BEAN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.STRAW.THE.COAL.AND.THE.BEAN ... [--TOTAL--] in 0.03 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/BRIAR.ROSE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... BRIAR.ROSE ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.DOG.AND.THE.SPARROW-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.DOG.AND.THE.SPARROW ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.TWELVE.DANCING.PRINCESSES-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.TWELVE.DANCING.PRINCESSES ... [--TOTAL--] in 0.09 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.FISHERMAN.AND.HIS.WIFE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.FISHERMAN.AND.HIS.WIFE ... [--TOTAL--] in 0.11 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.WILLOW.WREN.AND.THE.BEAR-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.WILLOW.WREN.AND.THE.BEAR ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.FROG.PRINCE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.FROG.PRINCE ... [--TOTAL--] in 0.06 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/CAT.AND.MOUSE.IN.PARTNERSHIP-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... CAT.AND.MOUSE.IN.PARTNERSHIP ... [--TOTAL--] in 0.06 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.GOOSE.GIRL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.GOOSE.GIRL ... [--TOTAL--] in 0.11 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.ADVENTURES.OF.CHANTICLEER.AND.PARTLET-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.ADVENTURES.OF.CHANTICLEER.AND.PARTLET ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/RAPUNZEL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... RAPUNZEL ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/FUNDEVOGEL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... FUNDEVOGEL ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.VALIANT.LITTLE.TAILOR-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.VALIANT.LITTLE.TAILOR ... [--TOTAL--] in 0.15 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/HANSEL.AND.GRETEL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... HANSEL.AND.GRETEL ... [--TOTAL--] in 0.14 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.MOUSE.THE.BIRD.AND.THE.SAUSAGE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.MOUSE.THE.BIRD.AND.THE.SAUSAGE ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/MOTHER.HOLLE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... MOTHER.HOLLE ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/LITTLE.RED.CAP.[LITTLE.RED.RIDING.HOOD]-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... LITTLE.RED.CAP.[LITTLE.RED.RIDING.HOOD] ... [--TOTAL--] in 0.16 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.ROBBER.BRIDEGROOM-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.ROBBER.BRIDEGROOM ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/TOM.THUMB-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... TOM.THUMB ... [--TOTAL--] in 0.11 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/RUMPELSTILTSKIN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... RUMPELSTILTSKIN ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/CLEVER.GRETEL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... CLEVER.GRETEL ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.OLD.MAN.AND.HIS.GRANDSON-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.OLD.MAN.AND.HIS.GRANDSON ... [--TOTAL--] in 0.02 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.LITTLE.PEASANT-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.LITTLE.PEASANT ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/FREDERICK.AND.CATHERINE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... FREDERICK.AND.CATHERINE ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/SWEETHEART.ROLAND-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... SWEETHEART.ROLAND ... [--TOTAL--] in 0.06 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/SNOWDROP-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... SNOWDROP ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.PINK-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.PINK ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/CLEVER.ELSIE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... CLEVER.ELSIE ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.MISER.IN.THE.BUSH-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.MISER.IN.THE.BUSH ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/ASHPUTTEL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... ASHPUTTEL ... [--TOTAL--] in 0.11 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.WHITE.SNAKE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.WHITE.SNAKE ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.WOLF.AND.THE.SEVEN.LITTLE.KIDS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.WOLF.AND.THE.SEVEN.LITTLE.KIDS ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.QUEEN.BEE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.QUEEN.BEE ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.ELVES.AND.THE.SHOEMAKER-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.ELVES.AND.THE.SHOEMAKER ... [--TOTAL--] in 0.1 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.JUNIPER.TREE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.JUNIPER.TREE ... [--TOTAL--] in 0.14 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.TURNIP-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.TURNIP ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/CLEVER.HANS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... CLEVER.HANS ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.THREE.LANGUAGES-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.THREE.LANGUAGES ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.FOX.AND.THE.CAT-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.FOX.AND.THE.CAT ... [--TOTAL--] in 0.02 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.FOUR.CLEVER.BROTHERS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.FOUR.CLEVER.BROTHERS ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/LILY.AND.THE.LION-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... LILY.AND.THE.LION ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.FOX.AND.THE.HORSE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.FOX.AND.THE.HORSE ... [--TOTAL--] in 0.03 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.BLUE.LIGHT-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.BLUE.LIGHT ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.RAVEN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.RAVEN ... [--TOTAL--] in 0.1 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.GOLDEN.GOOSE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.GOLDEN.GOOSE ... [--TOTAL--] in 0.06 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.WATER.OF.LIFE-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.WATER.OF.LIFE ... [--TOTAL--] in 0.09 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.TWELVE.HUNTSMEN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.TWELVE.HUNTSMEN ... [--TOTAL--] in 0.05 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.KING.OF.THE.GOLDEN.MOUNTAIN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.KING.OF.THE.GOLDEN.MOUNTAIN ... [--TOTAL--] in 0.14 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/DOCTOR.KNOWALL-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... DOCTOR.KNOWALL ... [--TOTAL--] in 0.03 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.SEVEN.RAVENS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.SEVEN.RAVENS ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.WEDDING.OF.MRS.FOX-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.WEDDING.OF.MRS.FOX ... [--TOTAL--] in 0.04 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.SALAD-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.SALAD ... [--TOTAL--] in 0.08 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/THE.STORY.OF.THE.YOUTH.WHO.WENT.FORTH.TO.LEARN.WHAT.FEAR.WAS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... THE.STORY.OF.THE.YOUTH.WHO.WENT.FORTH.TO.LEARN.WHAT.FEAR.WAS ... [--TOTAL--] in 0.16 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/KING.GRISLY.BEARD-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... KING.GRISLY.BEARD ... [--TOTAL--] in 0.07 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/IRON.HANS-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... IRON.HANS ... [--TOTAL--] in 0.16 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/CAT.SKIN-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... CAT.SKIN ... [--TOTAL--] in 0.09 secs"
## [1] "C:/Users/Alexander Nevsky/Dropbox/WSU-419/Fall 2020/__student_access__/unit_02_confirmatory_data_analysis/nascent/nlp/_data_/gutenberg/2591/SNOW.WHITE.AND.ROSE.RED-6043f89979332674fc7969bd9af02e59.rds"
## [1] "story: ... SNOW.WHITE.AND.ROSE.RED ... [--TOTAL--] in 0.19 secs"
## [1] "THE BROTHERS GRIMM FAIRY TALES"
## [1] "THE GOLDEN BIRD"
## [1] "HANS IN LUCK"
## [1] "JORINDA AND JORINDEL"
## [1] "THE TRAVELLING MUSICIANS"
## [1] "OLD SULTAN"
## [1] "THE STRAW, THE COAL, AND THE BEAN"
## [1] "BRIAR ROSE"
## [1] "THE DOG AND THE SPARROW"
## [1] "THE TWELVE DANCING PRINCESSES"
## [1] "THE FISHERMAN AND HIS WIFE"
## [1] "THE WILLOW-WREN AND THE BEAR"
## [1] "THE FROG-PRINCE"
## [1] "CAT AND MOUSE IN PARTNERSHIP"
## [1] "THE GOOSE-GIRL"
## [1] "THE ADVENTURES OF CHANTICLEER AND PARTLET"
## [1] "RAPUNZEL"
## [1] "FUNDEVOGEL"
## [1] "THE VALIANT LITTLE TAILOR"
## [1] "HANSEL AND GRETEL"
## [1] "THE MOUSE, THE BIRD, AND THE SAUSAGE"
## [1] "MOTHER HOLLE"
## [1] "LITTLE RED-CAP [LITTLE RED RIDING HOOD]"
## [1] "THE ROBBER BRIDEGROOM"
## [1] "TOM THUMB"
## [1] "RUMPELSTILTSKIN"
## [1] "CLEVER GRETEL"
## [1] "THE OLD MAN AND HIS GRANDSON"
## [1] "THE LITTLE PEASANT"
## [1] "FREDERICK AND CATHERINE"
## [1] "SWEETHEART ROLAND"
## [1] "SNOWDROP"
## [1] "THE PINK"
## [1] "CLEVER ELSIE"
## [1] "THE MISER IN THE BUSH"
## [1] "ASHPUTTEL"
## [1] "THE WHITE SNAKE"
## [1] "THE WOLF AND THE SEVEN LITTLE KIDS"
## [1] "THE QUEEN BEE"
## [1] "THE ELVES AND THE SHOEMAKER"
## [1] "THE JUNIPER-TREE"
## [1] "THE TURNIP"
## [1] "CLEVER HANS"
## [1] "THE THREE LANGUAGES"
## [1] "THE FOX AND THE CAT"
## [1] "THE FOUR CLEVER BROTHERS"
## [1] "LILY AND THE LION"
## [1] "THE FOX AND THE HORSE"
## [1] "THE BLUE LIGHT"
## [1] "THE RAVEN"
## [1] "THE GOLDEN GOOSE"
## [1] "THE WATER OF LIFE"
## [1] "THE TWELVE HUNTSMEN"
## [1] "THE KING OF THE GOLDEN MOUNTAIN"
## [1] "DOCTOR KNOWALL"
## [1] "THE SEVEN RAVENS"
## [1] "THE WEDDING OF MRS FOX"
## [1] "THE SALAD"
## [1] "THE STORY OF THE YOUTH WHO WENT FORTH TO LEARN WHAT FEAR WAS"
## [1] "KING GRISLY-BEARD"
## [1] "IRON HANS"
## [1] "CAT-SKIN"
## [1] "SNOW-WHITE AND ROSE-RED"
# Rows x columns of the POS n-gram table (auto-printed).
dim(x = my.df)
## [1] 63 34662
We have lots of features, so let’s truncate them by requiring that a feature appear in at least N of the documents.
X = as.matrix(my.df);
# Document frequency: the number of rows (stories) in which each feature
# occurs at least once.  colCounts() counts cells equal to `value` (default
# TRUE), so on the numeric matrix it would only count cells that equal
# exactly 1; counting on the logical matrix X > 0 gives the intended frequency.
# NOTE(review): the tallies printed below were produced before this fix and
# will change when the document is re-knit.
term.in.corpus = colCounts(X > 0);
# How many features occur in exactly i documents, for i = 1..10.
for(i in 1:10)
  {
  print(paste0("i: ",i," ---->", sum(term.in.corpus == i) ));
  }
## [1] "i: 1 ---->25412"
## [1] "i: 2 ---->4143"
## [1] "i: 3 ---->1666"
## [1] "i: 4 ---->814"
## [1] "i: 5 ---->542"
## [1] "i: 6 ---->378"
## [1] "i: 7 ---->236"
## [1] "i: 8 ---->201"
## [1] "i: 9 ---->122"
## [1] "i: 10 ---->112"
# let's truncate to features whose term.in.corpus value is 3 or more
# A logical keep-mask is used instead of negative indexing: X[, -idx] silently
# drops EVERY column when idx is empty.  drop = FALSE keeps a matrix even when
# only one column survives.
keep = (term.in.corpus >= 3);
idx = which(!keep); # dropped columns; same value as before
X.trunc = X[, keep, drop = FALSE];
dim(X.trunc);
## [1] 63 4565
We are now in a more manageable term-feature space. However, this is still way too big to run hclust directly, so let’s build some tf-idf weights and use SVD to analyze …
# Normalize by each story's total n-gram count in the CURRENT matrix X.  The
# global words.in.doc was computed earlier from the custom word-count matrix
# and is stale here; it only recycled silently because the row counts match.
tf.s = X.trunc / rowSums2(X);
idf.un = 1;
# tf-idf is a product; with idf.un = 1 the values equal tf.s.
X.tf.s.idf.un = tf.s * idf.un;
# we could try values from a few (8) to hundreds
Xs.svd = svd(X.tf.s.idf.un, nu = 24, nv = 24);
# Story-by-story cosine similarity in the 24-dimensional reduced space.
Xs.cos.stories = ( round( cosine(t(Xs.svd$u)),2 ) );
rownames(Xs.cos.stories) = my.df$title;
colnames(Xs.cos.stories) = my.df$title;
as.data.frame(Xs.cos.stories);
do.nothing = perform.hclust(X.tf.s.idf.un, 8, dist.method = Xs.cos.stories, dist.p="cosine", plot.grid=1);
## [1] "Pruning 1 of 8"
## [1] "Pruning 2 of 8"
## [1] "Pruning 3 of 8"
## [1] "Pruning 4 of 8"
## [1] "Pruning 5 of 8"
## [1] "Pruning 6 of 8"
## [1] "Pruning 7 of 8"
## [1] "Pruning 8 of 8"
Similarity is now based on shared strings of 5 consecutive POS tags, so stories that cluster together share a linguistic style (after data reduction).
Little Red Riding Hood and White Snake?
– TODO – Change up the matrix of words in CUSTOM, and run that result.
1.2.2 Comments
Notice that data reduction cleans up some of the branch-isolation noise. I would consider updating the original vector list (e.g., remove “and”), and passing it into the function.
I created this list based on my perception of GRIMM stories. It is of note to recognize that the terms are common across all documents. In the last example, we have to create a common list of features and do some sorting and organizing.